import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the housing data and print a summary of its columns.
df = pd.read_csv("house_data.csv")
df.info()

# Split the dataset in half.
splitting_int = int(len(df) * 0.5)
# Take explicit copies: both halves get new columns assigned further down,
# and writing into a bare ``iloc`` slice of ``df`` is ambiguous in pandas
# (it triggers SettingWithCopyWarning on versions without copy-on-write).
df1 = df.iloc[:splitting_int, :].copy()
df2 = df.iloc[splitting_int:, :].copy()
print(df1.shape)
print(df2.shape)
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import warnings
# Install a blanket "ignore" filter: no warning of any category is shown
# from this point on unless the filters are changed again.
warnings.filterwarnings("ignore")
def elbow_plot(data=None, max_clusters=9):
    """Plot K-Means inertia (WCSS) for 1..max_clusters clusters.

    Args:
        data: Feature table to cluster. When omitted, the ``price`` and
            ``sqft_living`` columns of the module-level ``df1`` are used,
            looked up at call time.
        max_clusters: Largest number of clusters to try (inclusive).
            Defaults to 9.

    Returns:
        The return value of ``plt.show()``.
    """
    if data is None:
        # Resolve the default lazily. A default written in the signature
        # would be evaluated once, when the function is defined, and would
        # never see later changes to ``df1``.
        data = df1[['price', 'sqft_living']]

    cluster_counts = range(1, max_clusters + 1)
    # One fit per candidate cluster count; inertia_ is the within-cluster
    # sum of squares for that fit.
    score = [
        KMeans(n_clusters=cluster, init="k-means++", random_state=10)
        .fit(data)
        .inertia_
        for cluster in cluster_counts
    ]

    plt.bar(cluster_counts, score)
    plt.title('The Elbow Method')
    plt.xlabel('no of clusters')
    plt.ylabel('wcss')
    plt.grid()
    return plt.show()
elbow_plot()

# Cluster the first half (df1) on min-max-scaled price and living area.
scaler = MinMaxScaler()
df1_features = df1[['price', 'sqft_living']]
# Scale into a separate frame instead of overwriting df1 and undoing it with
# inverse_transform afterwards: that round trip can perturb the stored values
# through floating-point error and turns integer columns into floats.
df1_scaled = pd.DataFrame(
    scaler.fit_transform(df1_features),
    columns=df1_features.columns,
    index=df1_features.index,
)
kmeans = KMeans(n_clusters=5, random_state=1000).fit(df1_scaled)
df1['cluster'] = kmeans.labels_
import pickle

# Persist the fitted model and read it back. Context managers make sure both
# file handles are closed.
with open("save.pkl", "wb") as model_file:
    pickle.dump(kmeans, model_file)
# NOTE: unpickling can execute arbitrary code; only load files written by
# this script.
with open("save.pkl", "rb") as model_file:
    km = pickle.load(model_file)

# Use a dedicated scaler for the second half so the scaler fitted on df1 is
# left untouched.
scaler2 = MinMaxScaler()
df2_features = df2[['price', 'sqft_living']]
# Scale into a separate frame rather than overwriting df2 and inverting the
# transform afterwards; the round trip can alter the stored values and dtypes.
df2_scaled = pd.DataFrame(
    scaler2.fit_transform(df2_features),
    columns=df2_features.columns,
    index=df2_features.index,
)
# NOTE(review): fit() re-estimates the centroids on df2, so the centroids
# loaded from save.pkl are discarded. If the intent is to apply the df1 model
# to df2, this should be scaler.transform(...) + km.predict(...) - confirm.
km.fit(df2_scaled)
df2['cluster'] = km.labels_
def cluster_summary(data):
    """Return per-cluster averages and row counts, one column per cluster.

    Args:
        data: DataFrame containing a ``cluster`` column. The identifier and
            location columns ``id``, ``lat``, ``long``, ``yr_built``,
            ``yr_renovated`` and ``zipcode`` are left out of the summary
            when present. The input frame is not modified.

    Returns:
        A DataFrame whose columns are the cluster labels and whose rows are
        the mean of each numeric column (rounded to 3 decimals) followed by
        a ``Count`` row holding the number of rows in each cluster.
    """
    excluded = ['id', 'lat', 'long', 'yr_built', 'yr_renovated', 'zipcode']
    # errors='ignore' lets the summary run on frames that lack some of the
    # excluded columns instead of raising KeyError.
    data = data.drop(columns=excluded, errors='ignore')

    grouped = data.groupby('cluster')
    count = grouped['cluster'].count()
    # numeric_only=True skips text columns (e.g. a date string), which would
    # otherwise make mean() raise TypeError on pandas >= 2.
    cluster_means = grouped.mean(numeric_only=True).round(3)
    cluster_means['Count'] = count
    return cluster_means.T
# Print the per-cluster summaries: a bare expression only displays in an
# interactive cell, so without print() nothing is shown when run as a script.
print(cluster_summary(df1))
print(cluster_summary(df2))
import plotly.express as px

hoverdata_list = [
    'floors',
    'grade',
    'view',
    'waterfront',
    'lat',
    'long',
    'yr_built',
]

# Interactive price vs. living-area scatter coloured by cluster:
# one figure for df1, then one for df2.
for half in (df1, df2):
    fig = px.scatter(
        half,
        x='sqft_living',
        y='price',
        color='cluster',
        hover_name='id',
        hover_data=hoverdata_list,
        render_mode='svg',
        template='plotly',
    )
    fig.show()
# Settings shared by every map figure below.
map_options = dict(
    lat="lat",
    lon="long",
    hover_name='id',
    hover_data=hoverdata_list,
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=15,
    zoom=10,
    mapbox_style="carto-positron",
)

# Maps in display order: df1 by cluster, df2 by year built, df2 by grade.
map_specs = (
    (df1, "cluster"),
    (df2, "yr_built"),
    (df2, "grade"),
)
for map_frame, color_column in map_specs:
    fig = px.scatter_mapbox(map_frame, color=color_column, **map_options)
    fig.show()
# Ternary plots of bedrooms / bathrooms / floors, coloured by cluster and
# sized by price: df1 first, then df2.
for ternary_frame in (df1, df2):
    fig = px.scatter_ternary(
        ternary_frame,
        a="bedrooms",
        b="bathrooms",
        c="floors",
        color='cluster',
        hover_name='id',
        size='price',
    )
    fig.show()